FIN 7821 Assignment1¶

In [47]:
import pandas as pd
import numpy as np

Question 1¶

In [55]:
#read dataset as df
df = pd.read_csv("NCES_CCD_extract.csv")
In [56]:
column_names = df.columns.tolist()
print(column_names)
['Agency Name', 'State Name [District] Latest available year', 'State Abbr [District] Latest available year', 'Agency ID - NCES Assigned [District] Latest available year', 'Hispanic Students [District] 2019-20', 'Hispanic Students [District] 2018-19', 'Hispanic Students [District] 2017-18', 'Hispanic Students [District] 2016-17', 'Hispanic Students [District] 2015-16', 'Hispanic Students [District] 2014-15', 'Black or African American Students [District] 2019-20', 'Black or African American Students [District] 2018-19', 'Black or African American Students [District] 2017-18', 'Black or African American Students [District] 2016-17', 'Black or African American Students [District] 2015-16', 'Black or African American Students [District] 2014-15', 'White Students [District] 2019-20', 'White Students [District] 2018-19', 'White Students [District] 2017-18', 'White Students [District] 2016-17', 'White Students [District] 2015-16', 'White Students [District] 2014-15', 'Fall Membership (V33) [District Finance] 2017-18', 'Fall Membership (V33) [District Finance] 2016-17', 'Fall Membership (V33) [District Finance] 2015-16', 'Fall Membership (V33) [District Finance] 2014-15', 'Total General Revenue (TOTALREV) [District Finance] 2017-18', 'Total General Revenue (TOTALREV) [District Finance] 2016-17', 'Total General Revenue (TOTALREV) [District Finance] 2015-16', 'Total General Revenue (TOTALREV) [District Finance] 2014-15', 'Total Expenditures (TOTALEXP) [District Finance] 2017-18', 'Total Expenditures (TOTALEXP) [District Finance] 2016-17', 'Total Expenditures (TOTALEXP) [District Finance] 2015-16', 'Total Expenditures (TOTALEXP) [District Finance] 2014-15']
  • Rename the variables to have sensible names
In [57]:
# Rename for easy use
df1 = df.rename(columns={'Agency Name':'distname',\
                        'State Name [District] Latest available year':'stname', 'State Abbr [District] Latest available year': 'stabbr', \
                         'Agency ID - NCES Assigned [District] Latest available year':'distid', 'Hispanic Students [District] 2019-20':'hisp2019', \
                          'Hispanic Students [District] 2018-19':'hisp2018','Hispanic Students [District] 2017-18':'hisp2017', \
                        'Hispanic Students [District] 2016-17':'hisp2016', 'Hispanic Students [District] 2015-16':'hisp2015', \
                          'Hispanic Students [District] 2014-15':'hisp2014', 'Black or African American Students [District] 2019-20':'black2019', \
                        'Black or African American Students [District] 2018-19':'black2018', 'Black or African American Students [District] 2017-18':'black2017',\
                        'Black or African American Students [District] 2016-17':'black2016', 'Black or African American Students [District] 2015-16':'black2015', 
                        'Black or African American Students [District] 2014-15':'black2014',  'White Students [District] 2019-20':'white2019', \
                         'White Students [District] 2018-19':'white2018', 'White Students [District] 2017-18':'white2017', \
                        'White Students [District] 2016-17':'white2016', 'White Students [District] 2015-16':'white2015', \
                        'White Students [District] 2014-15':'white2014', 'Fall Membership (V33) [District Finance] 2017-18': 'mem2017', \
                        'Fall Membership (V33) [District Finance] 2016-17': 'mem2016', 'Fall Membership (V33) [District Finance] 2015-16': 'mem2015', \
                        'Fall Membership (V33) [District Finance] 2014-15': 'mem2014', 'Total General Revenue (TOTALREV) [District Finance] 2017-18':'totrev2017',\
                        'Total General Revenue (TOTALREV) [District Finance] 2016-17':'totrev2016', 'Total General Revenue (TOTALREV) [District Finance] 2015-16':'totrev2015', \
                        'Total General Revenue (TOTALREV) [District Finance] 2014-15':'totrev2014', 'Total Expenditures (TOTALEXP) [District Finance] 2017-18':'totexp2017', \
                        'Total Expenditures (TOTALEXP) [District Finance] 2016-17':'totexp2016', 'Total Expenditures (TOTALEXP) [District Finance] 2015-16':'totexp2015', \
                        'Total Expenditures (TOTALEXP) [District Finance] 2014-15':'totexp2014'})
In [58]:
df1_column_names = df1.columns.tolist()
print(df1_column_names)
['distname', 'stname', 'stabbr', 'distid', 'hisp2019', 'hisp2018', 'hisp2017', 'hisp2016', 'hisp2015', 'hisp2014', 'black2019', 'black2018', 'black2017', 'black2016', 'black2015', 'black2014', 'white2019', 'white2018', 'white2017', 'white2016', 'white2015', 'white2014', 'mem2017', 'mem2016', 'mem2015', 'mem2014', 'totrev2017', 'totrev2016', 'totrev2015', 'totrev2014', 'totexp2017', 'totexp2016', 'totexp2015', 'totexp2014']
  • Destring the variables that should be numeric
In [59]:
# Destring the variables that should be numeric
numeric_columns = ['distid','hisp2019', 'hisp2018', 'hisp2017', 'hisp2016', 'hisp2015', 'hisp2014',
                   'black2019', 'black2018', 'black2017', 'black2016', 'black2015', 'black2014',
                   'white2019', 'white2018', 'white2017', 'white2016', 'white2015', 'white2014',
                   'mem2017', 'mem2016', 'mem2015', 'mem2014',
                   'totrev2017', 'totrev2016', 'totrev2015', 'totrev2014',
                   'totexp2017', 'totexp2016', 'totexp2015', 'totexp2014']

# copy the dataframe to df2 for future convenience
df2 = df1.copy()
df2[numeric_columns] = df2[numeric_columns].apply(pd.to_numeric, errors='coerce')
  • Drop the missing records in numerical cols
In [60]:
missing_data = df2[numeric_columns].isna()
df3 = df2.dropna(subset=numeric_columns)
In [61]:
df4 = df3.melt(id_vars=['distid', 'distname', 'stname', 'stabbr'], var_name='name_year')

df4[['name', 'year']] = df4['name_year'].str.extract(r'([A-Za-z]+)([0-9]+)')
df4 = df4.pivot_table(index=['distid', 'distname', 'stname', 'stabbr', 'year'], columns='name', values='value', aggfunc='sum').reset_index()
In [62]:
def describe_data(data):
    num_observations, num_variables = data.shape

    variable_types = data.dtypes
    print(f"Observations: {num_observations}")
    print(f"Variables: {num_variables}")
    print("\nVariable Storage Display")
    print(f"name\ttype\tformat")

    for variable_name, variable_type in variable_types.items():
        variable_format = ''
        if variable_type == 'int64':
            variable_format = '%10.0g'
        elif variable_type == 'float64':
            variable_format = '%10.0g'
        elif variable_type == 'object':
            variable_format = f'%{max(len(variable_name), 9)}s'

        print(f"{variable_name}\t{variable_type}\t{variable_format}")
        
describe_data(df4)
Observations: 59370
Variables: 11

Variable Storage Display
name	type	format
distid	float64	%10.0g
distname	object	%9s
stname	object	%9s
stabbr	object	%9s
year	object	%9s
black	float64	%10.0g
hisp	float64	%10.0g
mem	float64	%10.0g
totexp	float64	%10.0g
totrev	float64	%10.0g
white	float64	%10.0g

Question 2¶

  • Per-pupil revenue and per-pupil expenditures
In [63]:
# Question 2 - Per-pupil revenue and per-pupil expenditures
df5 = df4.copy()
df5['pupil_per_rev'] = df4['totrev'] / df4['mem']
df5['pupil_per_exp'] = df4['totexp'] / df4['mem']
  • Create log of per pupil revenues and expenditures by taking the log of each respective variable.
In [64]:
df6 = df5.copy()
df6['pupil_per_rev_log'] = np.log1p(df5['pupil_per_rev'])
df6['pupil_per_exp_log'] = np.log1p(df5['pupil_per_exp'])
  • Create the share of the student body that is either Black or Hispanic by adding Black and Hispanic enrollments together and dividing by total membership.
In [65]:
df7 = df6.copy()
df7['black_hisp_total'] = df7['black'] + df7['hisp']
df7['black_hisp_share'] = df7['black_hisp_total'] / df7['mem']

Question 3¶

  • Visualize the correlation between Black/Hispanic enrollment share and log of total revenues and log of total expenditures.
In [66]:
df8 = df7.copy()
df8['totrev_log'] = np.log1p(df8['totrev'])
df8['totexp_log'] = np.log1p(df8['totexp'])

import matplotlib.pyplot as plt

plt.figure(figsize=(10, 5))
plt.scatter(df8['black_hisp_share'], df8['totrev_log'], alpha=0.5)
plt.title('Correlation between Black/Hispanic Enrollment Share and Log of Total Revenues')
plt.xlabel('Black/Hispanic Enrollment Share')
plt.ylabel('Log of Total Revenues')
plt.grid(True)
plt.show()

plt.figure(figsize=(10, 5))
plt.scatter(df8['black_hisp_share'], df8['totexp_log'], alpha=0.5)
plt.title('Correlation between Black/Hispanic Enrollment Share and Log of Total Expenditures')
plt.xlabel('Black/Hispanic Enrollment Share')
plt.ylabel('Log of Total Expenditures')
plt.grid(True)
plt.show()

Question 4¶

In a single figure, plot both the average log of total per pupil revenue and log of total per pupil expenditures against the share of Black/Hispanic student quantiles.

In [67]:
# Question 4 

df9=df8.copy()
df9['quantile'] = pd.qcut(df9['black_hisp_share'], q=100, labels=False)

quantile_stats = df9.groupby('quantile').agg({
    'totrev_log': 'mean',
    'totexp_log': 'mean'
}).reset_index()

import matplotlib.pyplot as plt
quantile_stats_array = quantile_stats.to_numpy()

plt.figure(figsize=(10, 5))
plt.plot(quantile_stats_array[:, 0], quantile_stats_array[:, 1], label='Avg Log Total Per Pupil Revenue')
plt.plot(quantile_stats_array[:, 0], quantile_stats_array[:, 2], label='Avg Log Total Per Pupil Expenditures')
plt.title('Average Log Total Per Pupil Revenue and Expenditures by Black/Hispanic Student Quantiles')
plt.xlabel('Quantiles of Black/Hispanic Student Share')
plt.ylabel('Average Log Value')
plt.legend()
plt.grid(True)
plt.show()
  • In a single figure, plot both the average log of total per pupil revenue and log of total

per pupil expenditures against the share of Black/Hispanic student quantiles. What patterns emerge?

  • Answer: The per pupil expenditure and expenditures have positive correlation with black/hispanic student share, as the black/hispanic share goes high, the expenditures and revenues going up as well."

Question 5¶

  • Plot the state average total per-pupil expenditures against the state average Black/Hispanic enrollment share and label each point with the corresponding state two-letter abbreviation.
In [68]:
# Group the data by state (stabbr)
state_groups = df9.groupby('stabbr')

# Calculate the average Black/Hispanic share and per-pupil expenditures for each state
state_summary = state_groups.agg({
    'black': 'mean',
    'hisp': 'mean',
    'black_hisp_share':'mean',
    'pupil_per_exp': 'mean'
}).reset_index()

state_summary.head()
Out[68]:
name stabbr black hisp black_hisp_share pupil_per_exp
0 DC 1832.826923 493.448718 0.758782 26269.471453
1 DE 1087.900901 617.981982 0.469254 15404.021295
2 FL 9283.910448 13691.378109 0.382428 10569.113415
3 GA 3497.125683 1473.353370 0.431537 11374.751516
4 HI 3189.666667 24591.666667 0.143884 15281.662068
In [69]:
# Rename the columns if needed
state_summary.rename(columns={'black_hisp_share': 'Avg_black_hist_Share', 'pupil_per_exp': 'Avg_Per_Pupil_Exp'}, inplace=True)

# Plot the scatter plot with the x-axis as the combined avg Black/Hispanic share and the y-axis as the average per-pupil expenditures
plt.scatter(state_summary['Avg_black_hist_Share'], state_summary['Avg_Per_Pupil_Exp'], alpha=0.5)

# Add labels with two-letter state abbreviations
for i, txt in enumerate(state_summary['stabbr']):
    plt.annotate(txt, (state_summary['Avg_black_hist_Share'].iloc[i], state_summary['Avg_Per_Pupil_Exp'].iloc[i]))

# Add axis labels and a title
plt.xlabel('Average Black/Hispanic Enrollment Share')
plt.ylabel('Average Per-Pupil Expenditures')
plt.title('Relationship between State-Average Student Expenditures and Combined Black/Hispanic Enrollment Share')

# Display the plot
plt.grid(True)  
In [70]:
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 
import contextily as ctx 
import geopandas as gpd 
import os 
from mpl_toolkits.axes_grid1 import make_axes_locatable
In [71]:
# Get stats
# Retrive the official shapefiles maps from the US Census Bureau 
# link: https://www.census.gov/cgi-bin/geo/shapefiles/index.php
path = "tl_2022_us_state/tl_2022_us_state.shp"
gdf = gpd.read_file(path)
gdf = gdf.to_crs("EPSG:4326")
In [35]:
gdf.head(1)
Out[35]:
REGION DIVISION STATEFP STATENS GEOID STUSPS NAME LSAD MTFCC FUNCSTAT ALAND AWATER INTPTLAT INTPTLON geometry
0 3 5 54 01779805 54 WV West Virginia 00 G4000 A 62266456923 489045863 +38.6472854 -080.6183274 POLYGON ((-77.75438 39.33346, -77.75422 39.333...
In [36]:
# basic plot
gdf.plot()
Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd20f26e280>
In [37]:
# removal of non_continetal states for better visualization
non_continental = ['HI','VI','MP','GU','AK','AS','PR']
us49 = gdf
for n in non_continental:
    us49 = us49[us49.STUSPS != n]
In [38]:
# Simple map for US States based on the downloaded file
f,ax = plt.subplots(1,1, figsize=(8,6), sharex=True, sharey=True, dpi=300)
plt.title('Simple Map of US States')
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="3%",pad=0,alpha=0.5)
us49.plot('ALAND', ax=ax, alpha=0.5, cmap='Pastel1', edgecolor='k', legend=True, cax=cax, linewidth=0.1)
plt.show()
In [34]:
# Starting combine the data we have processed with the States data with geo information
# Checks for state_summary(done in previous questions)
state_summary.head(3)
Out[34]:
stabbr black hisp Avg_black_hist_Share Avg_Per_Pupil_Exp
0 DC 1832.826923 493.448718 0.758782 26269.471453
1 DE 1087.900901 617.981982 0.469254 15404.021295
2 FL 9283.910448 13691.378109 0.382428 10569.113415
In [35]:
# Combine the data with geo information
import geopandas as gpd
import matplotlib.pyplot as plt

my_data = state_summary[['stabbr', 'Avg_black_hist_Share']]
merged = us49.merge(my_data, left_on='STUSPS', right_on='stabbr', how='inner')

# Filter out non-continental states
non_continental = ['HI', 'VI', 'MP', 'GU', 'AK', 'AS', 'PR']
merged = merged[~merged['STUSPS'].isin(non_continental)]

# Handle missing values in the specified column (e.g., 'Avg_black_hist_Share')
# Replace NaN values with a default value (e.g., 0)
column_to_plot = 'Avg_black_hist_Share'
default_value = 0

merged[column_to_plot].fillna(default_value, inplace=True)  # Replace NaN with the default value

plt.show()
In [36]:
data_types = my_data.dtypes
print(data_types)
stabbr                   object
Avg_black_hist_Share    float64
dtype: object
In [37]:
# Remove leading/trailing whitespaces and convert to uppercase
us49['STUSPS'] = us49['STUSPS'].str.strip().str.upper()
my_data['stabbr'] = my_data['stabbr'].str.strip().str.upper()

# Perform an inner join
df_join = us49.merge(my_data, left_on='STUSPS', right_on='stabbr', how='inner')
df_join.head(3)
<ipython-input-37-ffcc16cb307c>:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_data['stabbr'] = my_data['stabbr'].str.strip().str.upper()
Out[37]:
REGION DIVISION STATEFP STATENS GEOID STUSPS NAME LSAD MTFCC FUNCSTAT ALAND AWATER INTPTLAT INTPTLON geometry stabbr Avg_black_hist_Share
0 3 5 54 01779805 54 WV West Virginia 00 G4000 A 62266456923 489045863 +38.6472854 -080.6183274 POLYGON ((-77.75438 39.33346, -77.75422 39.333... WV 0.036465
1 3 5 12 00294478 12 FL Florida 00 G4000 A 138962819934 45971472526 +28.3989775 -082.5143005 MULTIPOLYGON (((-83.10874 24.62949, -83.10711 ... FL 0.382428
2 2 3 17 01779784 17 IL Illinois 00 G4000 A 143778515726 6216539665 +40.1028754 -089.1526108 POLYGON ((-87.89243 38.28285, -87.89334 38.282... IL 0.248599
In [38]:
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

def StatesPlot(df, data, cmap):
    fig, ax = plt.subplots(1, 1, figsize=(15, 10), sharex=True, sharey=True, dpi=300)
    fig.tight_layout()
    plt.title('United States Map - Variable = ' + data)
    ax.set_axis_off()
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="3%", pad=0.5, alpha=0.5)
    
    # Plot the data using the specified colormap
    df.plot(data, ax=ax, alpha=0.5, cmap=cmap, edgecolor='k', legend=True, cax=cax, linewidth=0.1)
    
    # Customize the colorbar label and appearance
    cax.set_title(data, fontsize=12)
    
    plt.show()

# Example usage with df_join:
StatesPlot(df_join, 'Avg_black_hist_Share', 'YlOrRd')

Question 6¶

  • Example 1, using data from OpenNYC
In [251]:
# Specify the name that is used to seach for the data
import geopandas as gpd
import matplotlib.pyplot as plt

shapefile_path = 'Green_Cart_Bnd_2008_DOHMH_2010/Green_Cart_Bnd_2008_DOHMH_2010.shp'

gdf_states_ny = gpd.read_file(shapefile_path)

# gdf_states_ny.head()
gdf_states_ny.plot()
plt.title('Map of Your Dataset')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()
In [271]:
gdf_states_ny.head()
Out[271]:
BOROCODE COUNT_PREC FIRST_BORO FIRST_BO_1 INCL_PREC geometry
0 1 8 1 Manhattan 23, 25, 26, 28, 30, 32, 33, 34\r\n33, and 34. MULTIPOLYGON (((1007642.105 227756.197, 100742...
1 2 10 2 The Bronx 40, 41, 42, 44, 46, 47, 48, 49, 52 MULTIPOLYGON (((1012821.806 229228.265, 101278...
2 3 9 3 Brooklyn 67, 70, 71, 73, 75, 77, 81, 83 MULTIPOLYGON (((1027452.041 156400.814, 102683...
3 4 4 4 Queens 100, 101, 103, 113 MULTIPOLYGON (((1032066.245 153833.959, 103147...
4 5 1 5 Staten Island 120 MULTIPOLYGON (((968962.134 160164.304, 969308....
In [270]:
# Map of NYC
f, ax = plt.subplots(1, 1, figsize=(12, 10), sharex=True, sharey=True, dpi=300)
gdf_states_ny.plot('FIRST_BO_1', ax=ax, alpha=0.5, cmap='Pastel1', edgecolor='k', legend=True, linewidth=0.1)
plt.title('Map of NYC')

divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="3%", pad=0.1)

gdf_states_ny.plot('FIRST_BO_1', ax=ax, alpha=0.5, cmap='Pastel1', edgecolor='k', cax=cax, linewidth=0.1)
plt.show()
  • Example 2 - using data from the official shapefiles maps from the US Census Bureau (same offical web to download the US states data), this time select Block group -> Select Utah
  • link: https://www.census.gov/cgi-bin/geo/shapefiles/index.php
In [267]:
import geopandas as gpd
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

shapefile_path = 'tl_2022_49_tabblock20/tl_2022_49_tabblock20.shp'
gdf_states_ut = gpd.read_file(shapefile_path)

f, ax = plt.subplots(1, 1, figsize=(12, 10), sharex=True, sharey=True, dpi=300)
gdf_states_ut.plot('ALAND20', ax=ax, alpha=0.5, cmap='Pastel1', edgecolor='k', legend=True, linewidth=0.1)
plt.title('Map of Utah')

divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="3%", pad=0.1)

gdf_states_ut.plot('ALAND20', ax=ax, alpha=0.5, cmap='Pastel1', edgecolor='k', cax=cax, linewidth=0.1)
plt.show()
  • Using Open Street Map API for further visualization
In [31]:
import geopandas as gpd
import osmnx as ox
import matplotlib.pyplot as plt
In [28]:
# Specify the name that is used to seach for the data
place_name = "Salt Lake City, Utah, USA"

# Get place boundary related to the place name as a geodataframe
area = ox.geocode_to_gdf(place_name)

# Check the data type
area
Out[28]:
geometry bbox_north bbox_south bbox_east bbox_west place_id osm_type osm_id lat lon class type place_rank importance addresstype name display_name
0 POLYGON ((-112.10139 40.82715, -112.10132 40.8... 40.853391 40.699926 -111.740484 -112.101392 315499582 relation 198770 40.75962 -111.886797 boundary administrative 16 0.637243 city Salt Lake City Salt Lake City, Salt Lake County, Utah, United...
In [29]:
# List key-value pairs for tags
tags = {'building': True}   

buildings = ox.geometries_from_place(place_name, tags)
buildings.head()
/var/folders/q5/_56gtjhj19s4q7xtbc4g0chc0000gn/T/ipykernel_13851/2857596332.py:4: UserWarning: The `geometries` module and `geometries_from_X` functions have been renamed the `features` module and `features_from_X` functions. Use these instead. The `geometries` module and function names are deprecated and will be removed in a future release.
  buildings = ox.geometries_from_place(place_name, tags)
Out[29]:
building ele gnis:Class gnis:County gnis:County_num gnis:ST_alpha gnis:ST_num gnis:feature_id name geometry ... tower:type passports payment:paypal area service:vehicle:painting substation construction:aeroway faculty ways type
element_type osmid
node 150941193 yes 1337 Populated Place Salt Lake 035 UT 49 1449753 Tara Condominium POINT (-111.89322 40.77525) ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
150945663 yes 1325 Populated Place Salt Lake 035 UT 49 1449754 Trevi Towers Condominium POINT (-111.89311 40.77462) ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
150948580 yes 1464 Populated Place Salt Lake 035 UT 49 1449779 Northpoint Estates Condominium POINT (-111.87954 40.78641) ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
150954906 yes 1288 Populated Place Salt Lake 035 UT 49 1449799 Villa Montaige Condominium POINT (-111.93105 40.76022) ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
150954929 residential 1357 Populated Place Salt Lake 035 UT 49 1454367 The Avenues Heritage Condominium POINT (-111.87124 40.77219) ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 334 columns

In [30]:
# Plot footprints 
buildings.plot()
Out[30]:
<Axes: >